#!/usr/bin/env python3
'''
Author: Paschalis Agapitos
Project: Mestizajes

Prepares embedding corpora from Wikipedia graph data by loading link
structures and biography metadata, mapping node identifiers to article titles,
filtering biographies by field of human activity and century range, and
building century-level link documents suitable for downstream embedding and
analysis pipelines.
'''

import pandas as pd
from pathlib import Path
from connecting_people.preprocessing.get_century import extract_year, get_centuries

def load_and_prepare_graph(base_dir):
    """
    Read the English Wikipedia link graph and label both edge endpoints.

    Two parquet files are read from below ``base_dir``: the edge list
    (columns ``Source`` and ``Target``) and the node mapping (columns ``id``
    and ``label``). The ``Source`` and ``Target`` values are then replaced by
    the ``label`` that the mapping associates with them.

    Parameters:
    - base_dir: Base directory path (string or Path object)

    Returns:
    - DataFrame with the columns ``Source`` and ``Target`` holding labels.
      Identifiers that are missing from the mapping become NaN.
    """
    root = Path(base_dir)

    # Restrict both reads to the columns that are actually used.
    edges = pd.read_parquet(
        root / "en/graph/en_wiki_graph.parquet",
        engine="pyarrow",
        columns=["Source", "Target"],
    )
    node_table = pd.read_parquet(
        root / "en/graph/en_id_node_mapping.parquet",
        engine="pyarrow",
        columns=["id", "label"],
    )

    # Plain dict lookup: node id -> article label.
    label_by_id = dict(zip(node_table["id"], node_table["label"]))
    for endpoint in ("Source", "Target"):
        edges[endpoint] = edges[endpoint].map(label_by_id)

    return edges

def load_and_prepare_metadata(base_dir):
    """
    Read the biography metadata CSV and drop duplicated rows.

    Only the columns ``Source``, ``field_of_human_activity`` and ``COB_S`` are
    loaded. The first two are parsed as pandas categoricals and ``COB_S`` as
    ``int8``.

    Parameters:
    - base_dir: Base directory path (string or Path object)

    Returns:
    - DataFrame with the three columns above, without fully duplicated rows.
    """
    csv_path = Path(base_dir) / "connecting_people/experiments_filters/exp3/df_final2.csv"

    # Column name -> dtype; the keys double as the list of columns to load.
    # NOTE(review): int8 cannot hold missing values, so read_csv will raise if
    # COB_S contains blanks - confirm the CSV is complete.
    column_dtypes = {
        "Source": "category",
        "field_of_human_activity": "category",
        "COB_S": "int8",
    }

    metadata = pd.read_csv(
        csv_path,
        encoding="utf-8",
        usecols=list(column_dtypes),
        dtype=column_dtypes,
    )
    return metadata.drop_duplicates()

def filter_biographies_by_activity(
    enwiki_graph,
    humans_metadata,
    field_of_activity="Science",
    *,
    min_century=-5,
    max_century=21,
):
    """
    Keep the outgoing links of biographies that belong to one field of activity.

    The graph edges whose ``Source`` is a biography listed in
    ``humans_metadata`` are joined with that metadata. A row is kept when
    ``field_of_activity`` is one of the comma-separated (whitespace-stripped)
    entries of its ``field_of_human_activity`` value and its century of birth
    lies in ``[min_century, max_century)``. Neither input frame is modified.

    Parameters:
    - enwiki_graph: DataFrame with at least the columns ``Source`` and ``Target``
    - humans_metadata: DataFrame with the columns ``Source``,
      ``field_of_human_activity`` and ``COB_S`` (century of birth)
    - field_of_activity: Field of human activity to filter by (default: "Science")
    - min_century: Lowest century of birth that is kept, inclusive (default: -5)
    - max_century: Upper bound on the century of birth, exclusive (default: 21)

    Returns:
    - DataFrame with the columns ``Source``, ``century_of_birth``,
      ``field_of_human_activity`` and ``Target``. Every row carries
      ``field_of_activity`` in ``field_of_human_activity``, and each matching
      link appears once per matching metadata row. The index holds the row
      positions of the intermediate join, so it is generally not contiguous.
    """
    output_columns = ["Source", "century_of_birth", "field_of_human_activity", "Target"]

    metadata = humans_metadata[
        ["Source", "field_of_human_activity", "COB_S"]
    ].rename(columns={"COB_S": "century_of_birth"})

    # Drop edges that do not start at a known biography before joining, so the
    # merge only has to handle the relevant part of the graph.
    biography_names = set(metadata["Source"].unique())
    biography_edges = enwiki_graph.loc[
        enwiki_graph["Source"].isin(biography_names), ["Source", "Target"]
    ]
    merged = biography_edges.merge(metadata, on="Source")

    # Parse each distinct label once instead of splitting the string on every
    # edge row; the number of distinct labels is tiny compared to the edges.
    raw_labels = merged["field_of_human_activity"]
    matching_labels = [
        label
        for label in raw_labels.unique()
        if field_of_activity in {part.strip() for part in str(label).split(",")}
    ]
    has_field = raw_labels.isin(matching_labels)

    centuries = merged["century_of_birth"]
    in_century_range = (centuries >= min_century) & (centuries < max_century)

    selected = merged.loc[has_field & in_century_range, output_columns]

    # assign() returns a new frame, so no value is ever written into a slice of
    # ``merged`` (which is what raises SettingWithCopyWarning).
    return selected.assign(field_of_human_activity=field_of_activity)

# def exclude_specific_names(filtered_data, exclude_names=None):
#     """
#     Exclude specific names from the data.
#     """
#     if exclude_names is None:
#         exclude_names = ["Lu Zhi (Han dynasty)", "Cheng Bing", "Xun Yue"]
    
#     filtered_data = filtered_data[~filtered_data["Source"].isin(exclude_names)]
#     return filtered_data

def create_docs_per_century(filtered_data):
    """
    Collect the link targets of every century of birth into one document.

    Parameters:
    - filtered_data: DataFrame with the columns ``century_of_birth`` and ``Target``

    Returns:
    - Dictionary mapping each century of birth to the list of its ``Target``
      values.
    """
    targets_by_century = filtered_data.groupby("century_of_birth")["Target"]
    century_documents = targets_by_century.apply(list)
    return century_documents.to_dict()

def process_pipeline(base_dir, field_of_activity="Science", output_path=None, exclude_names=None):
    """
    Main pipeline function that processes the data with the specified field of human activity.

    Parameters:
    - base_dir: Base directory path (string or Path object)
    - field_of_activity: Field of human activity to filter by (default: "Science")
    - output_path: Directory in which the output parquet file is saved (optional);
      it is created if it does not exist
    - exclude_names: Biography name, or iterable of biography names, whose rows
      are removed (matched against ``Source``) before the per-century documents
      are built (optional; by default nothing is excluded)

    Returns:
    - Tuple ``(filtered_data, docs_per_century)``: the filtered link table and
      the dictionary mapping each century of birth to its list of link targets.
    """
    base_dir = Path(base_dir)  # Convert to Path object if it's a string

    # Step 1: Load and prepare graph
    enwiki_graph = load_and_prepare_graph(base_dir)

    # Step 2: Load and prepare metadata
    humans_metadata = load_and_prepare_metadata(base_dir)

    # Step 3: Filter biographies by activity
    filtered_data = filter_biographies_by_activity(enwiki_graph, humans_metadata, field_of_activity)

    # Step 4: Exclude specific names. The argument used to be accepted but
    # silently ignored; it is honoured now, and None keeps every biography.
    if exclude_names is not None:
        if isinstance(exclude_names, str):
            # A bare string would otherwise be treated as a sequence of characters.
            exclude_names = [exclude_names]
        filtered_data = filtered_data[~filtered_data["Source"].isin(list(exclude_names))]

    # Step 5: Create docs per century dictionary
    docs_per_century = create_docs_per_century(filtered_data)

    # Step 6: Save to parquet if output path is provided
    if output_path:
        output_path = Path(output_path)  # Convert to Path object if it's a string
        output_path.mkdir(parents=True, exist_ok=True)  # Create directory if it doesn't exist
        field_clean = field_of_activity.replace(" ", "_").replace("&", "and").lower()
        output_file = output_path / f"{field_clean}_as_source2.parquet"
        filtered_data.to_parquet(output_file, engine="pyarrow", compression="gzip")

    return filtered_data, docs_per_century
